The most basic similarity function compares which functions are called in each cell. We consider three possible definitions of a function's name:
For example, the function call in:
from sklearn import linear_model
linear_model.logistic_regression()
Would be written as the following:
Let's take a look at what happens when we take the Jaccard similarity of the function calls that appear in each cell.
In [1]:
# Necessary imports
import os
import time
from nbminer.features.features import Features
from nbminer.notebook_miner import NotebookMiner
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
In [2]:
from nbminer.stats.summary import Summary
In [3]:
# Gather the path of every .ipynb file found directly inside each
# sub-directory of ../testbed/Final, then wrap each path in a NotebookMiner
# and hand the whole collection to Features.
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if not os.path.isdir(person):
        # Only directories are scanned for notebooks.
        continue
    direc = os.listdir(person)
    for filename in direc:
        if filename.endswith('.ipynb'):
            notebooks.append(os.path.join(person, filename))
a = Features(list(map(NotebookMiner, notebooks)))
In [6]:
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.get_simple_features import GetSimpleFeatures
from nbminer.results.similarity.jaccard_similarity import SegmentJaccardSimilarity
# Compute the Jaccard similarity between cells.
#
# The three preprocessing transformers run in a fixed order; the value
# returned by each transform() is rebound to `a` and fed to the next step.
gaf = GetASTFeatures()
a = gaf.transform(a)
gi = GetImports()
a = gi.transform(a)
sf = GetSimpleFeatures()
a = sf.transform(a)

# Only the similarity step is timed (preprocessing above is excluded).
# perf_counter() is used instead of time.time() because it is monotonic and
# high-resolution, so the reported interval cannot be skewed by wall-clock
# adjustments while the computation runs.
s = time.perf_counter()
segJS = SegmentJaccardSimilarity()
# `rd` is indexed by key in the cells that follow; `cls` is the second
# value returned by transform().
rd, cls = segJS.transform(a)
print('Time elapsed: ', time.perf_counter() - s)
Now that we have computed all of the Jaccard similarity metrics, we can start looking at the results.
In [33]:
# Pull each of the three similarity scores out of every entry of `rd`,
# producing one list per score in rd's iteration order.
short_similarities = [rd[key]['short_similarity'] for key in rd]
full_similarities = [rd[key]['full_similarity'] for key in rd]
call_similarities = [rd[key]['call_similarity'] for key in rd]
In [34]:
import numpy as np

# Rebind the three score collections as NumPy arrays so the later cells can
# use np.mean and histogram them directly.
short_similarities, full_similarities, call_similarities = (
    np.array(values)
    for values in (short_similarities, full_similarities, call_similarities)
)
In [35]:
# Print the mean score for each name definition, one per line, in the order
# short, full, call.
for similarities in (short_similarities, full_similarities, call_similarities):
    print(np.mean(similarities))
In [36]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
In [37]:
# Draw one 10-bin histogram per similarity score, side by side.
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(1, 3)
panels = (
    ('short_similarity', short_similarities),
    ('full_similarity', full_similarities),
    ('call_similarity', call_similarities),
)
for ax, (label, scores) in zip(axes, panels):
    ax.hist(scores, bins=10)
    # Title each panel with the score it shows; without a title the three
    # histograms cannot be told apart in the rendered figure.
    ax.set_title(label)
Out[37]:
Looks like we have a lot of zeros here. Let's see exactly how many, and then take them out to get a better look at the rest.
In [38]:
# Report how many short-name similarity scores are strictly positive, both
# as a count and as a fraction of all scores.
# The comparison is vectorised, so the scores are not looped over in Python.
greater_than_0 = int(np.count_nonzero(np.asarray(short_similarities) > 0))
total_length = len(short_similarities)
print('Total length: ', total_length)
print('Greater than 0: ', greater_than_0)
# With no scores at all, report 0.0 rather than raising ZeroDivisionError.
print('Fraction greater than 0: ',
      greater_than_0 / total_length if total_length else 0.0)
In [39]:
# Report how many full-name similarity scores are strictly positive, both
# as a count and as a fraction of all scores.
# The comparison is vectorised, so the scores are not looped over in Python.
greater_than_0 = int(np.count_nonzero(np.asarray(full_similarities) > 0))
total_length = len(full_similarities)
print('Total length: ', total_length)
print('Greater than 0: ', greater_than_0)
# With no scores at all, report 0.0 rather than raising ZeroDivisionError.
print('Fraction greater than 0: ',
      greater_than_0 / total_length if total_length else 0.0)
In [40]:
# Report how many call-name similarity scores are strictly positive, both
# as a count and as a fraction of all scores.
# The comparison is vectorised, so the scores are not looped over in Python.
greater_than_0 = int(np.count_nonzero(np.asarray(call_similarities) > 0))
total_length = len(call_similarities)
print('Total length: ', total_length)
print('Greater than 0: ', greater_than_0)
# With no scores at all, report 0.0 rather than raising ZeroDivisionError.
print('Fraction greater than 0: ',
      greater_than_0 / total_length if total_length else 0.0)
In [41]:
# Draw the same three histograms again, this time keeping only the strictly
# positive scores so the zeros no longer dominate the picture.
plt.rcParams['figure.figsize'] = (20, 10)
fig, axes = plt.subplots(1, 3)
panels = (
    ('short_similarity', short_similarities),
    ('full_similarity', full_similarities),
    ('call_similarity', call_similarities),
)
for ax, (label, scores) in zip(axes, panels):
    scores = np.asarray(scores)
    # A boolean mask selects the positive scores without a Python-level loop.
    ax.hist(scores[scores > 0], bins=10)
    # Title each panel so the three histograms can be told apart.
    ax.set_title(label + ' (> 0)')
Out[41]:
In [43]:
# Print up to five cell pairs whose 'short_similarity' is higher than their
# 'full_similarity', considering only pairs whose 'full_similarity' is above
# 0.5. Pairs whose X code contains 'head' or 'sum' are skipped.
total_examples = 5
for key in rd:
    pair = rd[key]
    full_score = pair['full_similarity']
    if not (pair['short_similarity'] > full_score > .5):
        continue
    code_x = pair['code_x'][0]
    if 'head' in code_x or 'sum' in code_x:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    # One source line per output line, exactly as a per-line loop would print.
    print(*code_x.split('\n'), sep='\n')
    print("\n\nCODE Y:")
    print(*pair['code_y'][0].split('\n'), sep='\n')
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break
Most similar
In [44]:
# Print up to five cell pairs whose 'full_similarity' is above 0.5, showing
# the code of both cells.
total_examples = 5
for key in rd:
    pair = rd[key]
    if not (pair['full_similarity'] > .5):
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    # One source line per output line, exactly as a per-line loop would print.
    print(*pair['code_x'][0].split('\n'), sep='\n')
    print("\n\nCODE Y:")
    print(*pair['code_y'][0].split('\n'), sep='\n')
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break
In [45]:
# Print up to five cell pairs whose 'short_similarity' is above 0.5, showing
# the code of both cells.
total_examples = 5
for key in rd:
    pair = rd[key]
    if not (pair['short_similarity'] > .5):
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    # One source line per output line, exactly as a per-line loop would print.
    print(*pair['code_x'][0].split('\n'), sep='\n')
    print("\n\nCODE Y:")
    print(*pair['code_y'][0].split('\n'), sep='\n')
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break
In [46]:
# Print up to five cell pairs whose 'short_similarity' is above 0.5,
# skipping any pair whose X code contains 'get_ipython'.
total_examples = 5
for key in rd:
    pair = rd[key]
    if not (pair['short_similarity'] > .5):
        continue
    code_x = pair['code_x'][0]
    if 'get_ipython' in code_x:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN CODE X AND Y")
    print("CODE X:")
    # One source line per output line, exactly as a per-line loop would print.
    print(*code_x.split('\n'), sep='\n')
    print("\n\nCODE Y:")
    print(*pair['code_y'][0].split('\n'), sep='\n')
    print("\n\n\n\n\n\n")
    total_examples -= 1
    if total_examples == 0:
        break
In [51]:
# Print the 'full_similarity_x' / 'full_similarity_y' values for up to fifty
# cell pairs whose 'full_similarity' is above 0.5, skipping any pair whose
# X code contains 'get_ipython'.
total_examples = 50
for key in rd:
    pair = rd[key]
    if not (pair['full_similarity'] > .5):
        continue
    if 'get_ipython' in pair['code_x'][0]:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y")
    print("SET X:")
    print(pair['full_similarity_x'])
    print("SET Y:")
    print(pair['full_similarity_y'])
    total_examples -= 1
    if total_examples == 0:
        break
In [54]:
# Some of the cells turned out to be really similar, so zero in on the
# linear regression ones: print up to five pairs with 'full_similarity'
# above 0.5 whose 'full_similarity_x' contains
# 'sklearn.linear_model.LinearRegression', with the code of both cells.
total_examples = 5
for key in rd:
    pair = rd[key]
    if not (pair['full_similarity'] > .5):
        continue
    if 'sklearn.linear_model.LinearRegression' not in pair['full_similarity_x']:
        continue
    print("LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y")
    print("SET X:")
    print(pair['full_similarity_x'])
    # One source line per output line, exactly as a per-line loop would print.
    print(*pair['code_x'][0].split('\n'), sep='\n')
    print("SET Y:")
    print(pair['full_similarity_y'])
    print(*pair['code_y'][0].split('\n'), sep='\n')
    print('\n\n\n\n\n\n')
    total_examples -= 1
    if total_examples == 0:
        break
In [55]:
# The previous examples were pretty short, so look for similar cells with a
# high number of functions: print up to five pairs with 'full_similarity'
# above 0.5 whose 'full_similarity_x' has more than 10 entries, with the
# code of both cells.
total_examples = 5
for key in rd:
    pair = rd[key]
    if not (pair['full_similarity'] > .5):
        continue
    if not (len(pair['full_similarity_x']) > 10):
        continue
    print("LOOKING AT DIFFERENCE BETWEEN SETS FOR X AND Y")
    print("SET X:")
    print(pair['full_similarity_x'])
    # One source line per output line, exactly as a per-line loop would print.
    print(*pair['code_x'][0].split('\n'), sep='\n')
    print("SET Y:")
    print(pair['full_similarity_y'])
    print(*pair['code_y'][0].split('\n'), sep='\n')
    print('\n\n\n\n\n\n')
    total_examples -= 1
    if total_examples == 0:
        break
In [ ]: